# --------------------------------
# setup
# --------------------------------

# libraries
library(dplyr)
library(lubridate)
library(topicmodels)
library(ggplot2)
library(tidytext)

# --------------------------------
# load pre-trained topic models
# --------------------------------
tm <- readRDS("data/outputs/tm_165.rds")
document_dates <- readRDS("data/outputs/document_dates.rds")
document_dates <- document_dates %>% mutate(year = year(date), date_by_month = format(date, "%Y-%m"))


# --------------------------------
# functions
# --------------------------------
# function: feed seed words, finds top topic
find_top_topic_for_seeds <- function(tm, seeds, N){
  words_by_topic <- tidy(tm, matrix = "beta") %>%
    filter(term %in% seeds) %>%
    select(-term) %>%
    group_by(topic) %>%
    summarize(beta = mean(beta)) %>%
    ungroup() %>%
    top_n(N, wt = beta) %>%
    arrange(-beta)
  return(words_by_topic$topic)}

# --------------------------------
# document topic probabilities
# --------------------------------

# per document topic proabilities matrix (gamma)
doc_topics <- tidy(tm, matrix = "gamma") %>% arrange(document, -gamma) 

# merge with dates and average over date (year)
doc_topics <- merge(doc_topics, document_dates, by = "document") %>% filter(year < 2013)
doc_topics_avg <- doc_topics %>% 
  group_by(topic, year) %>% 
  summarize(gamma = mean(gamma), .groups = 'drop_last') %>% 
  ungroup() %>%
  arrange(year, -gamma)

# --------------------------------
# figure A.6 (b)
# --------------------------------

# top topic for each seed
pobreza_topic <- find_top_topic_for_seeds(tm = tm, seeds = 'pobreza', N = 3)
constitucion_topic <- find_top_topic_for_seeds(tm = tm, seeds = 'constituyente', N = 3)

fgA6b <- doc_topics %>% 
  filter(topic %in% c(pobreza_topic, constitucion_topic)) %>%
  select(topic, gamma, year) %>%
  mutate(topic = if_else(topic %in% pobreza_topic, "pobreza", "constituyente")) %>%
  group_by(year, topic) %>% summarize(gamma = mean(gamma), .groups = "drop_last") %>% ungroup() %>%
  ggplot(aes(x = year, y = gamma)) +
  geom_bar(stat="identity", color = "white", fill = "gray70") + 
  ylab('Theme probability \n (average gamma for top \n 3 topics associated to themes)') +
  xlab('Year') +
  facet_wrap(~topic, scales = "free_y") +
  scale_x_continuous(breaks = seq(1998, 2012, by = 1)) +
  expand_limits(x = 1998, y = 0) +
  scale_fill_manual(name  ="Topics", 
                    labels = c("Constituyente", "Pobreza"),
                    values=c("#bdbdbd", "#636363")) +
  theme(panel.background = element_blank(),
        axis.text.x = element_text(size=16, angle = 90, vjust = 0.5),
        axis.text.y = element_text(size=16),
        axis.title.y = element_text(size=18, margin = margin(t = 0, r = 15, b = 0, l = 15)),
        axis.title.x = element_blank(),
        strip.text = element_text(size = 18),
        legend.position = "none")

ggsave(filename = "fgA6b.pdf", plot = fgA6b, height = 12, width = 12, path = './figures/', dpi = 1000)

# --------------------------------
# figure A.6 (a)
# --------------------------------

# per topic word proabilities matrix (beta)
topic_term <- tidy(tm, matrix = "beta") 

# top words per topic
top_words_by_topic <- topic_term %>% 
  group_by(topic) %>% 
  top_n(10, wt = beta) %>% 
  ungroup() %>%
  arrange(topic, -beta)

# plot terms of closest topics
pobreza_topic_name <- tibble(topic = pobreza_topic, name = paste0("Pobreza-", seq(1,length(pobreza_topic),1)), theme = "Pobreza")
constitucion_topic_name <- tibble(topic = constitucion_topic, name = paste0("Constituyente-", seq(1,length(constitucion_topic),1)),theme = "Constituyente")
top_topics_names <- rbind(pobreza_topic_name, constitucion_topic_name) %>% mutate(plot_order = 1:nrow(.))

# plot A6.a
fgA6a <- top_words_by_topic %>% 
  filter(topic %in% unique(c(pobreza_topic, constitucion_topic))) %>%
  merge(top_topics_names, by = "topic") %>%
  ggplot(aes(reorder_within(term, beta, name), beta, fill = theme)) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +
  scale_fill_grey() +
  facet_wrap(~ name, scales = "free") +
  coord_flip() +
  ylab('') +
  xlab('')

ggsave(filename = "fgA6a.pdf", plot = fgA6a, height = 12, width = 12, path = './figures/', dpi = 1000)



